This analysis is part of the capstone project for Google Data Analytics Certificate.
# Calling out the library
library(tidyverse)
library(lubridate)
library(ggplot2)
library(skimr)
library(janitor)
library(dplyr)
# Upload the Divvy datasets here (CSV formats)
apr_2020 <- read_csv("2020_04_tripdata.csv")
may_2020 <- read_csv("2020_05_tripdata.csv")
jun_2020 <- read_csv("2020_06_tripdata.csv")
jul_2020 <- read_csv("2020_07_tripdata.csv")
aug_2020 <- read_csv("2020_08_tripdata.csv")
sep_2020 <- read_csv("2020_09_tripdata.csv")
oct_2020 <- read_csv("2020_10_tripdata.csv")
nov_2020 <- read_csv("2020_11_tripdata.csv")
dec_2020 <- read_csv("2020_12_tripdata.csv")
jan_2021 <- read_csv("2021_01_tripdata.csv")
feb_2021 <- read_csv("2021_02_tripdata.csv")
mar_2021 <- read_csv("2021_03_tripdata.csv")
# Compare column names
colnames(apr_2020)
## [1] "ride_id" "rideable_type" "started_at"
## [4] "ended_at" "start_station_name" "start_station_id"
## [7] "end_station_name" "end_station_id" "start_lat"
## [10] "start_lng" "end_lat" "end_lng"
## [13] "usertype"
colnames(oct_2020)
## [1] "ride_id" "rideable_type" "started_at"
## [4] "ended_at" "start_station_name" "start_station_id"
## [7] "end_station_name" "end_station_id" "start_lat"
## [10] "start_lng" "end_lat" "end_lng"
## [13] "member_casual" "ride_length" "day_of_week"
## [16] "month_year"
# Check apr 2020 specific column "usertype"
head(apr_2020$usertype)
## [1] "member" "member" "member" "member" "casual" "member"
# Check oct 2020 columns
head(oct_2020[c("member_casual","ride_length","day_of_week","month_year")])
# Rename the files
apr_2020 <- rename(apr_2020, member_casual = usertype)
may_2020 <- rename(may_2020, member_casual = usertype)
jun_2020 <- rename(jun_2020, member_casual = usertype)
jul_2020 <- rename(jul_2020, member_casual = usertype)
aug_2020 <- rename(aug_2020, member_casual = usertype)
sep_2020 <- rename(sep_2020, member_casual = usertype)
# Check to see the column is changed properly
str(apr_2020)
# Converting ride_length from time format to character type
# From Oct 2020 to Mar 2021
mar_2021 <- mutate(mar_2021, ride_length = as.character(ride_length))
feb_2021 <- mutate(feb_2021, ride_length = as.character(ride_length))
jan_2021 <- mutate(jan_2021, ride_length = as.character(ride_length))
dec_2020 <- mutate(dec_2020, ride_length = as.character(ride_length))
nov_2020 <- mutate(nov_2020, ride_length = as.character(ride_length))
oct_2020 <- mutate(oct_2020, ride_length = as.character(ride_length))
# Converting start_station_id and end_station_id from num to chr to allow combining
# From Apr 2020 to Nov 2020
apr_2020 <- mutate(apr_2020, start_station_id = as.character(start_station_id), end_station_id = as.character(end_station_id))
may_2020 <- mutate(may_2020, start_station_id = as.character(start_station_id), end_station_id = as.character(end_station_id))
jun_2020 <- mutate(jun_2020, start_station_id = as.character(start_station_id), end_station_id = as.character(end_station_id))
jul_2020 <- mutate(jul_2020, start_station_id = as.character(start_station_id), end_station_id = as.character(end_station_id))
aug_2020 <- mutate(aug_2020, start_station_id = as.character(start_station_id), end_station_id = as.character(end_station_id))
sep_2020 <- mutate(sep_2020, start_station_id = as.character(start_station_id), end_station_id = as.character(end_station_id))
oct_2020 <- mutate(oct_2020, start_station_id = as.character(start_station_id), end_station_id = as.character(end_station_id))
nov_2020 <- mutate(nov_2020, start_station_id = as.character(start_station_id), end_station_id = as.character(end_station_id))
# Stack individual dataframes into one big data frame
all_trips <- bind_rows(apr_2020, may_2020, jun_2020, jul_2020, aug_2020, sep_2020, oct_2020, nov_2020, dec_2020, jan_2021, feb_2021, mar_2021)
# Inspect the new dataframe
str(all_trips)
## spec_tbl_df [3,489,748 x 16] (S3: spec_tbl_df/tbl_df/tbl/data.frame)
## $ ride_id : chr [1:3489748] "A847FADBBC638E45" "5405B80E996FF60D" "5DD24A79A4E006F4" "2A59BBDF5CDBA725" ...
## $ rideable_type : chr [1:3489748] "docked_bike" "docked_bike" "docked_bike" "docked_bike" ...
## $ started_at : chr [1:3489748] "26/4/2020 17:45" "17/4/2020 17:08" "1/4/2020 17:54" "7/4/2020 12:50" ...
## $ ended_at : chr [1:3489748] "26/4/2020 18:12" "17/4/2020 17:17" "1/4/2020 18:08" "7/4/2020 13:02" ...
## $ start_station_name: chr [1:3489748] "Eckhart Park" "Drake Ave & Fullerton Ave" "McClurg Ct & Erie St" "California Ave & Division St" ...
## $ start_station_id : chr [1:3489748] "86" "503" "142" "216" ...
## $ end_station_name : chr [1:3489748] "Lincoln Ave & Diversey Pkwy" "Kosciuszko Park" "Indiana Ave & Roosevelt Rd" "Wood St & Augusta Blvd" ...
## $ end_station_id : chr [1:3489748] "152" "499" "255" "657" ...
## $ start_lat : num [1:3489748] 41.9 41.9 41.9 41.9 41.9 ...
## $ start_lng : num [1:3489748] -87.7 -87.7 -87.6 -87.7 -87.6 ...
## $ end_lat : num [1:3489748] 41.9 41.9 41.9 41.9 42 ...
## $ end_lng : num [1:3489748] -87.7 -87.7 -87.6 -87.7 -87.7 ...
## $ member_casual : chr [1:3489748] "member" "member" "member" "member" ...
## $ ride_length : chr [1:3489748] NA NA NA NA ...
## $ day_of_week : num [1:3489748] NA NA NA NA NA NA NA NA NA NA ...
## $ month_year : chr [1:3489748] NA NA NA NA ...
## - attr(*, "spec")=
## .. cols(
## .. ride_id = col_character(),
## .. rideable_type = col_character(),
## .. started_at = col_character(),
## .. ended_at = col_character(),
## .. start_station_name = col_character(),
## .. start_station_id = col_double(),
## .. end_station_name = col_character(),
## .. end_station_id = col_double(),
## .. start_lat = col_double(),
## .. start_lng = col_double(),
## .. end_lat = col_double(),
## .. end_lng = col_double(),
## .. usertype = col_character()
## .. )
# Remove columns that are not present in all the files: "ride_length", "day_of_week", "month_year"
all_trips <- select(all_trips, -c(ride_length, day_of_week, month_year))
# List of summary to get a better idea of the data
head(all_trips) # See the 6 rows of data frame.
str(all_trips) # See list of columns and data types (numeric, character, etc)
summary(all_trips) # Statistical summary of data. Mainly for numerics
skim_without_charts(all_trips) # Gives a detailed summary (Rows, columns, variable type)
# Check the number of observations before reassigning
table(all_trips$member_casual)
##
## casual customer member subscriber
## 1275658 154718 1871085 188287
# To replace the values in the column accordingly
all_trips <- mutate(all_trips, member_casual = recode(member_casual, "subscriber" = "member", "customer" = "casual"))
# Check to make sure the proper number of observations were reassigned
table(all_trips$member_casual)
##
## casual member
## 1430376 2059372
# Add in the date column by converting the start date into date format
all_trips$date <- as.POSIXct(all_trips$started_at, format = "%d/%m/%Y")
# Add in the month, day, year and which day column
all_trips$month <- format(all_trips$date,format = "%m")
all_trips$day <- format(all_trips$date,format = "%d")
all_trips$year <- format(all_trips$date,format = "%Y")
all_trips$day_of_week <- format(all_trips$date,format = "%A")
# Viewing the data
head(all_trips)
# Convert the started_at and ended_at into datetime format for the calculation
all_trips$started_at <- as.POSIXct(all_trips$started_at, format = "%d/%m/%Y %H:%M")
all_trips$ended_at <- as.POSIXct(all_trips$ended_at, format = "%d/%m/%Y %H:%M")
# Calculate the ride length by deducting the start time from end time
all_trips$ride_length <- difftime(all_trips$ended_at,all_trips$started_at, units = "auto")
# Convert "ride_length" from factor to numeric to run calculations
all_trips$ride_length <- as.numeric(as.character(all_trips$ride_length))
# Quick check on the output
max(all_trips$ride_length)
## [1] 3523200
min(all_trips$ride_length)
## [1] -1743000
# Inspect the ride lengths that are negative
all_trips %>%
select(started_at,ended_at,ride_length) %>%
filter(ride_length < 0)
# Total number of rows
nrow(all_trips)
## [1] 3489748
# Removing the data where ride_length was negative and store into a dataframe
all_trips_v2 <- all_trips[!(all_trips$ride_length < 0),]
# Inspect the new dataframe for negative values in ride length
all_trips_v2 %>%
select(started_at,ended_at,ride_length) %>%
filter(ride_length < 0)
# Descriptive analysis on ride_length (all figures in seconds)
mean(all_trips_v2$ride_length)#straight average (total ride length/ rides)
## [1] 1673.579
median(all_trips_v2$ride_length) #midpoint number in the ascending array of ride lengths
## [1] 900
max(all_trips_v2$ride_length)#longest ride
## [1] 3523200
min(all_trips_v2$ride_length)#shortest ride
## [1] 0
# Summary of different stats
summary(all_trips_v2$ride_length)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0 480 900 1674 1620 3523200
# Comparing members and casual users using different stats
aggregate(all_trips_v2$ride_length~all_trips_v2$member_casual, FUN = mean)
aggregate(all_trips_v2$ride_length~all_trips_v2$member_casual, FUN = median)
aggregate(all_trips_v2$ride_length~all_trips_v2$member_casual, FUN = max)
aggregate(all_trips_v2$ride_length~all_trips_v2$member_casual, FUN = min)
# Generating the average ride time by each day for members vs casual users
aggregate(all_trips_v2$ride_length~all_trips_v2$member_casual + all_trips_v2$day_of_week, FUN = mean)
# Sort the days of the weeks into the correct order (Sunday until Saturday)
all_trips_v2$day_of_week <- ordered(all_trips_v2$day_of_week, levels = c("Sunday","Monday", "Tuesday", "Wednesday", "Thursday",
"Friday", "Saturday"))
# Generating the average ride time by each day for members vs casual users (After sorting the days)
aggregate(all_trips_v2$ride_length~all_trips_v2$member_casual + all_trips_v2$day_of_week, FUN = mean)
all_trips_v2 %>%
mutate(weekday = wday(started_at, label = TRUE)) %>% # Creates temporary weekday field using wday()
group_by(member_casual, weekday) %>% # Groups by usertype and weekday
summarise(number_of_rides = n() # Calculates the number of rides and average duration
,average_duration = mean(ride_length)) %>% # Calculates the average duration
arrange(member_casual, weekday) # Sorts by member and by the weekday
all_trips_v2 %>%
mutate(weekday = wday(started_at, label = TRUE)) %>%
group_by(member_casual, weekday) %>%
summarise(number_of_rides = n()
,average_duration = mean(ride_length)) %>%
arrange(member_casual, weekday) %>%
ggplot(aes(x = weekday, y = number_of_rides, fill = member_casual)) +
geom_col(position = "dodge") +
labs(x="Day of the week", y="Number of rides took", title="Number of rides (Casual vs Member)")
all_trips_v2 %>%
mutate(weekday = wday(started_at, label = TRUE)) %>%
group_by(member_casual, weekday) %>%
summarise(number_of_rides = n()
,average_duration = mean(ride_length)) %>%
arrange(member_casual, weekday) %>%
ggplot(aes(x = weekday, y = number_of_rides, fill = member_casual)) +
geom_col(position = "dodge") +
labs(x="Day of the week", y="Number of rides took", title="Number of rides (Casual vs Member)") +
facet_wrap(~member_casual)
all_trips_v2 %>%
mutate(weekday = wday(started_at, label = TRUE)) %>%
group_by(member_casual, weekday) %>%
summarise(number_of_rides = n()
,average_duration = mean(ride_length)) %>%
arrange(member_casual, weekday) %>%
ggplot(aes(x = weekday, y = average_duration, fill = member_casual)) +
geom_col(position = "dodge") +
labs(x="Day of the week", y="Average ride length (in sec)", title="Average ride length (Casual vs Member)")
all_trips_v2 %>%
mutate(weekday = wday(started_at, label = TRUE)) %>%
group_by(member_casual, weekday) %>%
summarise(number_of_rides = n()
,median_duration = median(ride_length)) %>%
arrange(member_casual, weekday) %>%
ggplot(aes(x = weekday, y = median_duration, fill = member_casual)) +
geom_col(position = "dodge") +
labs(x="Day of the week", y="Median ride length (in sec)", title="Median length of ride (Casual vs Member)")